Lab 13

Author

Abdullah Ahmad

library(tidyverse)
library(lubridate)
library(DT)
library(viridis)
library(janitor)
library(plotly)
library(respirometry)
library(visNetwork)

NEON MAG Community Composition, Diversity, and Quality Across Sites

This tutorial examines NEON MAGs across sites to summarize MAG richness and its relation to soil pH, taxonomic profiles by site, MAG quality (completeness and contamination), and the proportion of high quality MAGs per sample and site.

Step 1: Load and Prepare Data

df <- read_csv("NEON_soilMAGs_soilChem.csv")

head(df) 
# A tibble: 6 × 43
   ...1 genomicsSampleID soilMoisture decimalLatitude decimalLongitude elevation
  <dbl> <chr>                   <dbl>           <dbl>            <dbl>     <dbl>
1     1 BONA_001-O-2023…         1.55            65.2            -147.      374.
2     2 BONA_001-O-2023…         1.55            65.2            -147.      374.
3     3 BONA_001-O-2023…         1.55            65.2            -147.      374.
4     4 BONA_001-O-2023…         1.55            65.2            -147.      374.
5     5 BONA_001-O-2023…         1.55            65.2            -147.      374.
6     6 BONA_001-O-2023…         1.55            65.2            -147.      374.
# ℹ 37 more variables: soilTemp <dbl>, soilInWaterpH <dbl>, soilInCaClpH <dbl>,
#   bin_oid <chr>, bin_id <chr>, site <chr>, site_ID <chr>, subplot <chr>,
#   layer <chr>, quadrant <lgl>, date <dbl>, img_genome_id <dbl>,
#   bin_quality <chr>, bin_lineage <chr>, gtdb_taxonomy_lineage <chr>,
#   domain <chr>, phylum <chr>, class <chr>, order <chr>, family <chr>,
#   genus <chr>, bin_methods <chr>, created_by <chr>, date_added <date>,
#   bin_completeness <dbl>, bin_contamination <dbl>, average_coverage <lgl>, …

Step 2: MAG Richness per Site and Relation to Soil pH

Explanation: Count MAGs per sample and visualize whether samples with different pH have different numbers of recovered MAGs.

richness <- df |>
  group_by(genomicsSampleID) |>
  
# I am doing mean, just so I can get one value per sample ID, since the pH is the same for all of it anyways.
  summarise(n_mags = n_distinct(bin_id), 
            soilInWaterpH = mean(soilInWaterpH, na.rm = TRUE), 
            .groups = "drop"
) |>
  mutate(pH_group = case_when(!is.na(soilInWaterpH) & soilInWaterpH < 5.5 ~ "acidic",
                              !is.na(soilInWaterpH) ~ "neutral"))


head(richness)
# A tibble: 6 × 4
  genomicsSampleID    n_mags soilInWaterpH pH_group
  <chr>                <int>         <dbl> <chr>   
1 BONA_001-O-20230710     32          3.99 acidic  
2 BONA_002-O-20230711     31          4.35 acidic  
3 BONA_004-O-20230712     40          3.69 acidic  
4 BONA_006-O-20230710     29          4.05 acidic  
5 BONA_009-O-20230711     16          4.08 acidic  
6 BONA_013-O-20230710     31          3.97 acidic  
richness |>
ggplot(aes(x = pH_group, y = n_mags)) +
  geom_boxplot() + geom_jitter() +
  labs(
    x = "pH Group (<5.5 = acidic)",
    y = "MAG Richness (Distinct bin_id)",
    title = "MAG Richness per Sample and Relation to Soil pH"
  )

# Correlation test, 
cor_test <- cor.test(richness$n_mags, richness$soilInWaterpH, use="complete.obs")
print(cor_test)

    Pearson's product-moment correlation

data:  richness$n_mags and richness$soilInWaterpH
t = -4.8204, df = 291, p-value = 2.311e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.3748380 -0.1624033
sample estimates:
       cor 
-0.2719304 

The correlation test shows that MAG richness is negatively correlated with sample pH (r = -0.2719, 95% CI = -0.3748 to -0.1624, n = 293, p = 2.31e-06).

Step 3: Taxonomic Profiling by Site

Explanation: Summarize phylum-level composition per site and plot stacked bars.

tax_profile <- df |>
  group_by(site_ID, phylum) |>
  summarise(count = n(), .groups = "drop") |>
  mutate(rel_abundance = count / sum(count))

ggplot(tax_profile, aes(x = site_ID, y = rel_abundance, fill = phylum)) +
  geom_bar(stat = "identity") +
  scale_fill_viridis_d(option = "turbo") +
  theme_minimal() +
  labs(title = "Taxonomic Profile by Site", y = "Relative Abundance") +
  theme(axis.text.x = element_text(angle = 60, size = 6, vjust = 1.2, hjust = 1), legend.position = "right")

Step 4: MAG Quality vs Soil Chemistry

Explanation: Visualize whether MAG quality/assembly metrics differ across soil gradients.

mag_meta <- df |>
  select(bin_id, genomicsSampleID, bin_completeness, bin_contamination, gene_count, scaffold_count, site_ID)

sample_env <- df |>
  distinct(genomicsSampleID, .keep_all = TRUE) |>
  select(genomicsSampleID, soilInWaterpH, soilMoisture, soilTemp, decimalLatitude, decimalLongitude, site_ID)

mag_env <- mag_meta |>
  left_join(sample_env, by = "genomicsSampleID")

head(mag_env)
# A tibble: 6 × 13
  bin_id          genomicsSampleID bin_completeness bin_contamination gene_count
  <chr>           <chr>                       <dbl>             <dbl>      <dbl>
1 3300079481_s0   BONA_001-O-2023…             95.6              4.96       5785
2 3300079481_s1   BONA_001-O-2023…             94.7              1.99       3054
3 3300079481_s101 BONA_001-O-2023…             51.8              9.42       3039
4 3300079481_s11  BONA_001-O-2023…             60.8              0.04       2539
5 3300079481_s15  BONA_001-O-2023…             81.5              0.18       3314
6 3300079481_s2   BONA_001-O-2023…             88.1              0.19       3753
# ℹ 8 more variables: scaffold_count <dbl>, site_ID.x <chr>,
#   soilInWaterpH <dbl>, soilMoisture <dbl>, soilTemp <dbl>,
#   decimalLatitude <dbl>, decimalLongitude <dbl>, site_ID.y <chr>
# Distribution graphs

p1 <- ggplot(mag_env, aes(x = bin_completeness)) + 
  geom_histogram(bins = 40, fill = "cornflowerblue", color = "white") + 
  theme_minimal() + labs(title = "Distribution of Bin Completeness",
                         x = "Bin Completeness",
                         y = "Count")

p2 <- ggplot(mag_env, aes(x = bin_contamination)) + 
  geom_histogram(bins = 40, fill = "tomato", color = "white") + 
  theme_minimal() + labs(title = "Distribution of Bin Contamination",
                         x = "Bin Contamination",
                         y = "Count")

p3 <- ggplot(mag_env, aes(x = scaffold_count)) + 
  geom_histogram(bins = 40, fill = "darkgreen", color = "white") + 
  theme_minimal() + labs(title = "Distribution of Scaffold Count",
                         x = "Scaffold Count",
                         y = "Count")

p4 <- ggplot(mag_env, aes(x = gene_count)) + 
  geom_histogram(bins = 40, fill = "purple", color = "white") + 
  theme_minimal() + labs(title = "Distribution of Gene Count",
                         x = "Gene Count",
                         y = "Count")

print(p1); print(p2); print(p3); print(p4)

Bin Completeness vs Soil pH

library(hexbin)

# Scatter Bin Completeness vs pH with linear fit
# I used hexbin because it was very hard to read with so many bin ID data points

p_comp_pH <- ggplot(mag_env, aes(x = soilInWaterpH, y = bin_completeness)) +
  stat_binhex(bins = 20) +
  scale_fill_viridis_c(option = "plasma", breaks = c(1, 5, 10, 20, 30, 40, 50),
                       labels = c("1", "5", "10", "20", "30", "40", "50")) +
  geom_smooth(method = "lm", se = TRUE, color = "white") +
  geom_point(alpha = 0) +
  theme_minimal() +
  labs(x = "pH",
       y = "Bin Completeness",
       title = "MAG Completeness vs Sample pH")
print(p_comp_pH)

# Correlation test for p_comp_pH

df2 <- mag_env %>%
  filter(!is.na(bin_completeness), !is.na(soilInWaterpH))
nrow(df2)  # sample size used
[1] 5534
pearson_res <- cor.test(df2$bin_completeness, df2$soilInWaterpH, method = "pearson")
print(pearson_res)   

    Pearson's product-moment correlation

data:  df2$bin_completeness and df2$soilInWaterpH
t = -13.118, df = 5532, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1991263 -0.1480192
sample estimates:
       cor 
-0.1736897 
# Correlation test results show a weak correlation between bin completeness and soil pH.

Bin Contamination vs Soil pH

p_cont_pH <- ggplot(mag_env, aes(x = soilInWaterpH, y = bin_contamination)) +
  geom_point(alpha = 0.2, size = 1) +

  stat_binhex(bins = 20) +
  scale_fill_viridis_c(option = "plasma", breaks = c(5, 10, 20, 30, 40, 50),
                       labels = c("5", "10", "20", "30", "40", "50")) +
  geom_smooth(method = "lm", se = TRUE, color = "white") +
  theme_minimal() +
  labs(
    x = "pH",
    y = "Bin Contamination",
    title = "MAG Contamination vs Sample pH"
  ) 
print(p_cont_pH)

Step 5: Proportion of High Quality MAGs per Sample/Site

Explanation: Computing how many high quality MAGs come from each sample or site.

Creating the Tables

# This is to create the datasets for site and sample

num_of_HQ <- df |>
  mutate(is_HQ = as.integer(bin_quality == "HQ"))

sample_summary <- num_of_HQ |>
  group_by(genomicsSampleID, site_ID) |>
  summarise(
    n_MAGs = n_distinct(bin_id),
    n_HQ = sum(is_HQ, na.rm = TRUE),
    prop_HQ = n_HQ / pmax(n_MAGs, 1),
    med_comp = median(bin_completeness, na.rm = TRUE),
    med_cont = median(bin_contamination, na.rm = TRUE),
    .groups = "drop"
  ) 

site_summary <- sample_summary %>%
  group_by(site_ID) %>%
  summarise(
    n_samples  = n(),
    total_MAGs = sum(n_MAGs, na.rm = TRUE),
    total_HQ   = sum(n_HQ, na.rm = TRUE),
    prop_HQ    = total_HQ / pmax(total_MAGs, 1),
    .groups = "drop"
  ) 

head(site_summary)
# A tibble: 6 × 5
  site_ID n_samples total_MAGs total_HQ prop_HQ
  <chr>       <int>      <int>    <int>   <dbl>
1 BONA           10        256       18  0.0703
2 CLBJ            9        276       21  0.0761
3 CPER            9        122        2  0.0164
4 DCFS           12        258       14  0.0543
5 DEJU           17        474       29  0.0612
6 DSNY            6        116        2  0.0172
head(sample_summary)
# A tibble: 6 × 7
  genomicsSampleID    site_ID n_MAGs  n_HQ prop_HQ med_comp med_cont
  <chr>               <chr>    <int> <int>   <dbl>    <dbl>    <dbl>
1 BONA_001-O-20230710 BONA        32     1  0.0312     66.2     1.9 
2 BONA_002-O-20230711 BONA        31     2  0.0645     76.9     2.14
3 BONA_004-O-20230712 BONA        40     5  0.125      76.5     1.66
4 BONA_006-O-20230710 BONA        29     5  0.172      81.0     1.13
5 BONA_009-O-20230711 BONA        16     2  0.125      73.7     1.92
6 BONA_013-O-20230710 BONA        31     0  0          64.0     2.02

Proportion of High Quality MAGs per Sample

Explanation: Create an interactive graph to better visualize data, and be able to sort each sample per site because there are so many different samples.

plot_ly(
  data = sample_summary,
  x = ~n_MAGs,
  y = ~prop_HQ,
  color = ~site_ID,          # one trace per site (automatic grouping)
  size  = ~n_MAGs,           # size indicates reliability
  sizes = c(6, 30),         # size range
  text  = ~paste0(
    "sample: ", genomicsSampleID,
    "<br>site: ", site_ID,
    "<br>n_MAGs: ", n_MAGs,
    "<br>prop_HQ: ", sprintf("%.3f", prop_HQ)
  ),
  hoverinfo = "text",
  type = "scatter",
  mode = "markers"
) |>
  layout(
    title = "Proportion of High Quality vs Number of MAGs (Click Legend to Highlight a Site)",
    xaxis = list(title = "Number of MAGs in Sample"),
    yaxis = list(title = "Proportion HQ", tickformat = ".0%"),
    legend = list(itemclick = "toggleothers")  # click a legend item to show that site only
  )

Proportion of High Quality MAGs per Site

ggplot(site_summary, aes(x = site_ID, y = prop_HQ)) +
  geom_col(fill = "steelblue2") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 60, vjust = 1.3, hjust = 1.2)) +
  labs(
    x = "Site ID",
    y = "Proportion of HQ MAGs",
    title = "Proportion of High Quality MAGs at Each Site"
       )